library(tidyverse)
library(ggfortify)
library(GGally)
library(skimr)
library(modelr)
# Load the red wine data set from the project's data folder.
wine_quality_red <- read_csv(file = "data/wine_quality_red.csv")
Rows: 1599 Columns: 14── Column specification ────────────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr (1): region
dbl (13): wine_id, fixed_acidity, volatile_acidity, citric_acid, residual_sugar, chlorides, free_sul...
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Load the white wine data set from the project's data folder.
wine_quality_white <- read_csv(file = "data/wine_quality_white.csv")
Rows: 4898 Columns: 14── Column specification ────────────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr (1): region
dbl (13): wine_id, fixed_acidity, volatile_acidity, citric_acid, residual_sugar, chlorides, free_sul...
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# First six rows of the red wine data, then a skimr overview of every column.
head(wine_quality_red, n = 6L)
skim(wine_quality_red)
── Data Summary ────────────────────────
Values
Name wine_quality_red
Number of rows 1599
Number of columns 14
_______________________
Column type frequency:
character 1
numeric 13
________________________
Group variables None
# List the unique values of the region column in the red wine data.
distinct(wine_quality_red, region)
# First six rows of the white wine data, then a skimr overview of every column.
head(wine_quality_white, n = 6L)
skim(wine_quality_white)
── Data Summary ────────────────────────
Values
Name wine_quality_white
Number of rows 4898
Number of columns 14
_______________________
Column type frequency:
character 1
numeric 13
________________________
Group variables None
# List the unique values of the region column in the white wine data.
distinct(wine_quality_white, region)
# Distribution of red wine quality scores.
# An explicit bin width replaces the default of 30 bins, which ggplot2 flags
# with a "pick better value" message and which leaves empty gaps between
# scores. NOTE(review): binwidth = 1 assumes quality is recorded in
# whole-number steps; confirm against the data.
wine_quality_red %>%
  ggplot(aes(x = quality)) +
  geom_histogram(binwidth = 1)
# Distribution of white wine quality scores.
# An explicit bin width replaces the default of 30 bins, which ggplot2 flags
# with a "pick better value" message and which leaves empty gaps between
# scores. NOTE(review): binwidth = 1 assumes quality is recorded in
# whole-number steps; confirm against the data.
wine_quality_white %>%
  ggplot(aes(x = quality)) +
  geom_histogram(binwidth = 1)
# Pairs plot of every red wine variable except wine_id and region, used to
# look for candidate predictors of quality.
# progress = FALSE turns off the progress bar, as in the other ggpairs()
# calls in this file.
wine_quality_red %>%
  select(-wine_id, -region) %>%
  ggpairs(progress = FALSE)
Candidate predictors of red wine quality from the pairs plot: volatile_acidity, sulphates, alcohol.
# Pairs plot of every white wine variable except wine_id and region, with
# the progress bar switched off.
wine_quality_white %>%
  select(-c(wine_id, region)) %>%
  ggpairs(progress = FALSE)
Candidate predictors of white wine quality from the pairs plot: alcohol, chlorides, density.
# Single-predictor linear models of red wine quality, one per candidate.
model_r1a <- lm(formula = quality ~ volatile_acidity, data = wine_quality_red)
model_r1b <- lm(formula = quality ~ sulphates, data = wine_quality_red)
model_r1c <- lm(formula = quality ~ alcohol, data = wine_quality_red)
# Single-predictor linear models of white wine quality, one per candidate.
model_w1a <- lm(formula = quality ~ alcohol, data = wine_quality_white)
model_w1b <- lm(formula = quality ~ chlorides, data = wine_quality_white)
model_w1c <- lm(formula = quality ~ density, data = wine_quality_white)
# Fit summary for red wine quality ~ volatile_acidity.
model_r1a %>%
  summary()
Call:
lm(formula = quality ~ volatile_acidity, data = wine_quality_red)
Residuals:
Min 1Q Median 3Q Max
-2.74440 -0.54813 -0.02734 0.53280 2.77353
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 6.57413 0.06155 106.81 <2e-16 ***
volatile_acidity -1.72666 0.11044 -15.63 <2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.7905 on 1597 degrees of freedom
Multiple R-squared: 0.1327, Adjusted R-squared: 0.1322
F-statistic: 244.4 on 1 and 1597 DF, p-value: < 2.2e-16
# Fit summary for red wine quality ~ sulphates.
model_r1b %>%
  summary()
Call:
lm(formula = quality ~ sulphates, data = wine_quality_red)
Residuals:
Min 1Q Median 3Q Max
-3.5954 -0.6056 -0.0354 0.5438 2.7696
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 4.90612 0.08286 59.209 <2e-16 ***
sulphates 1.14965 0.12192 9.429 <2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.8262 on 1597 degrees of freedom
Multiple R-squared: 0.05274, Adjusted R-squared: 0.05214
F-statistic: 88.91 on 1 and 1597 DF, p-value: < 2.2e-16
# Fit summary for red wine quality ~ alcohol.
model_r1c %>%
  summary()
Call:
lm(formula = quality ~ alcohol, data = wine_quality_red)
Residuals:
Min 1Q Median 3Q Max
-2.95517 -0.50166 -0.04501 0.49519 2.70011
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 1.89570 0.18603 10.19 <2e-16 ***
alcohol 0.36142 0.01776 20.36 <2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.7564 on 1597 degrees of freedom
Multiple R-squared: 0.206, Adjusted R-squared: 0.2055
F-statistic: 414.4 on 1 and 1597 DF, p-value: < 2.2e-16
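Alcohol is the best single predictor of red wine quality (R-squared 0.206, versus 0.1327 for volatile_acidity and 0.05274 for sulphates).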
# Fit summary for white wine quality ~ alcohol.
model_w1a %>%
  summary()
Call:
lm(formula = quality ~ alcohol, data = wine_quality_white)
Residuals:
Min 1Q Median 3Q Max
-3.5247 -0.5623 -0.0207 0.5432 3.2832
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 2.523661 0.104913 24.05 <2e-16 ***
alcohol 0.321456 0.009911 32.44 <2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.8535 on 4896 degrees of freedom
Multiple R-squared: 0.1769, Adjusted R-squared: 0.1767
F-statistic: 1052 on 1 and 4896 DF, p-value: < 2.2e-16
# Fit summary for white wine quality ~ chlorides.
model_w1b %>%
  summary()
Call:
lm(formula = quality ~ chlorides, data = wine_quality_white)
Residuals:
Min 1Q Median 3Q Max
-3.3853 -0.6165 0.0017 0.5394 3.4159
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 6.30427 0.03055 206.33 <2e-16 ***
chlorides -8.75508 0.60243 -14.53 <2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.921 on 4896 degrees of freedom
Multiple R-squared: 0.04135, Adjusted R-squared: 0.04116
F-statistic: 211.2 on 1 and 4896 DF, p-value: < 2.2e-16
# Fit summary for white wine quality ~ density.
model_w1c %>%
  summary()
Call:
lm(formula = quality ~ density, data = wine_quality_white)
Residuals:
Min 1Q Median 3Q Max
-3.3995 -0.6006 0.0039 0.5651 3.9017
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 98.452 4.267 23.07 <2e-16 ***
density -93.104 4.293 -21.69 <2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.8985 on 4896 degrees of freedom
Multiple R-squared: 0.08764, Adjusted R-squared: 0.08746
F-statistic: 470.3 on 1 and 4896 DF, p-value: < 2.2e-16
Alcohol is the best single predictor of white wine quality (R-squared 0.1769, versus 0.08764 for density and 0.04135 for chlorides).
# Residuals of the alcohol-only red wine model, kept alongside the remaining
# candidate predictors for the next pairs plot.
# wine_id is dropped as well: it is excluded from the earlier pairs plot and
# from every later residual step, so leaving it in here would only add an
# extra row and column to the ggpairs() grid.
# NOTE(review): wine_id is presumably a row identifier; confirm.
red_resid <- wine_quality_red %>%
  add_residuals(model = model_r1c) %>%
  select(-wine_id, -region, -quality, -alcohol)
# Residuals of the alcohol-only white wine model, kept alongside the
# remaining candidate predictors for the next pairs plot.
# wine_id is dropped as well: it is excluded from the earlier pairs plot and
# from every later residual step, so leaving it in here would only add an
# extra row and column to the ggpairs() grid.
# NOTE(review): wine_id is presumably a row identifier; confirm.
white_resid <- wine_quality_white %>%
  add_residuals(model = model_w1a) %>%
  select(-wine_id, -region, -quality, -alcohol)
# Pairs plot of the red wine residuals against the remaining variables.
red_resid %>%
  ggpairs(progress = FALSE)
Candidate second predictors for red wine, from the residual pairs plot: volatile_acidity, sulphates.
# Pairs plot of the white wine residuals against the remaining variables.
white_resid %>%
  ggpairs(progress = FALSE)
Candidate second predictors for white wine, from the residual pairs plot: volatile_acidity, free_sulfur_dioxide.
# Two-predictor red wine models: alcohol plus each candidate second predictor.
model_r2a <- lm(quality ~ alcohol + volatile_acidity, data = wine_quality_red)
model_r2b <- lm(quality ~ alcohol + sulphates, data = wine_quality_red)
# Two-predictor white wine models: alcohol plus each candidate second
# predictor.
model_w2a <- lm(
  quality ~ alcohol + volatile_acidity,
  data = wine_quality_white
)
model_w2b <- lm(
  quality ~ alcohol + free_sulfur_dioxide,
  data = wine_quality_white
)
# Fit summary for red wine quality ~ alcohol + volatile_acidity.
model_r2a %>%
  summary()
Call:
lm(formula = quality ~ alcohol + volatile_acidity, data = wine_quality_red)
Residuals:
Min 1Q Median 3Q Max
-2.55192 -0.47030 -0.01778 0.47927 2.41435
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 3.08357 0.19857 15.53 <2e-16 ***
alcohol 0.31565 0.01723 18.32 <2e-16 ***
volatile_acidity -1.34665 0.10254 -13.13 <2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.7188 on 1596 degrees of freedom
Multiple R-squared: 0.2834, Adjusted R-squared: 0.2825
F-statistic: 315.7 on 2 and 1596 DF, p-value: < 2.2e-16
# Fit summary for red wine quality ~ alcohol + sulphates.
model_r2b %>%
  summary()
Call:
lm(formula = quality ~ alcohol + sulphates, data = wine_quality_red)
Residuals:
Min 1Q Median 3Q Max
-2.9659 -0.4688 -0.0193 0.4856 2.3045
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 1.42025 0.19007 7.472 1.29e-13 ***
alcohol 0.34735 0.01744 19.919 < 2e-16 ***
sulphates 0.94527 0.10963 8.623 < 2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.7396 on 1596 degrees of freedom
Multiple R-squared: 0.2413, Adjusted R-squared: 0.2404
F-statistic: 253.9 on 2 and 1596 DF, p-value: < 2.2e-16
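volatile_acidity gives the better two-predictor red wine model (adjusted R-squared 0.2825 versus 0.2404 for sulphates), so model_r2a is used for the next round of red wine residuals.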
# F-test comparing the alcohol-only red model with the model that also
# includes volatile_acidity.
anova(model_r1c, model_r2a, test = "F")
Analysis of Variance Table
Model 1: quality ~ alcohol
Model 2: quality ~ alcohol + volatile_acidity
Res.Df RSS Df Sum of Sq F Pr(>F)
1 1597 913.65
2 1596 824.54 1 89.112 172.49 < 2.2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
# Residuals of model_r2a (stored in column "resid"), with the identifier,
# the response and the predictors already in the model removed.
red_resid <- wine_quality_red %>%
  add_residuals(model_r2a, var = "resid") %>%
  select(-c(wine_id, quality, alcohol, volatile_acidity))
# Fit summary for white wine quality ~ alcohol + volatile_acidity.
model_w2a %>%
  summary()
Call:
lm(formula = quality ~ alcohol + volatile_acidity, data = wine_quality_white)
Residuals:
Min 1Q Median 3Q Max
-3.6165 -0.5479 -0.0125 0.5235 3.2676
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 2.97427 0.10506 28.31 <2e-16 ***
alcohol 0.33282 0.00964 34.52 <2e-16 ***
volatile_acidity -2.04894 0.11770 -17.41 <2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.8283 on 4895 degrees of freedom
Multiple R-squared: 0.2249, Adjusted R-squared: 0.2245
F-statistic: 710 on 2 and 4895 DF, p-value: < 2.2e-16
# Fit summary for white wine quality ~ alcohol + free_sulfur_dioxide.
model_w2b %>%
  summary()
Call:
lm(formula = quality ~ alcohol + free_sulfur_dioxide, data = wine_quality_white)
Residuals:
Min 1Q Median 3Q Max
-4.2055 -0.5615 -0.0305 0.5487 3.3317
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 2.0726612 0.1167127 17.759 <2e-16 ***
alcohol 0.3432123 0.0101611 33.777 <2e-16 ***
free_sulfur_dioxide 0.0062945 0.0007352 8.561 <2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.8472 on 4895 degrees of freedom
Multiple R-squared: 0.189, Adjusted R-squared: 0.1887
F-statistic: 570.5 on 2 and 4895 DF, p-value: < 2.2e-16
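volatile_acidity is the better second predictor for white wine (adjusted R-squared 0.2245 versus 0.1887 for free_sulfur_dioxide).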
# F-test comparing the alcohol-only white model with the model that also
# includes volatile_acidity.
anova(model_w1a, model_w2a, test = "F")
Analysis of Variance Table
Model 1: quality ~ alcohol
Model 2: quality ~ alcohol + volatile_acidity
Res.Df RSS Df Sum of Sq F Pr(>F)
1 4896 3566.3
2 4895 3358.4 1 207.91 303.04 < 2.2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
# Residuals of model_w2a (stored in column "white_resid"), with the
# identifier, the response and the predictors already in the model removed.
white_resid <- wine_quality_white %>%
  add_residuals(model = model_w2a, var = "white_resid") %>%
  select(-c(wine_id, quality, alcohol, volatile_acidity))
# Pairs plot of the current red wine residual frame, progress bar off.
red_resid %>%
  ggpairs(progress = FALSE)
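Next, add sulphates to the red wine model and compare its adjusted R-squared with that of model_r2a.
Sulphates is statistically significant in the resulting model (p = 5.84e-09).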
# Three-predictor red wine model: alcohol + volatile_acidity + sulphates.
# The fit is defined here because the script otherwise uses model_r3 without
# ever creating it; the formula and data match the Call printed by summary().
model_r3 <- lm(formula = quality ~ alcohol + volatile_acidity + sulphates,
               data = wine_quality_red)
summary(model_r3)
Call:
lm(formula = quality ~ alcohol + volatile_acidity + sulphates,
data = wine_quality_red)
Residuals:
Min 1Q Median 3Q Max
-2.66345 -0.45393 -0.01326 0.48807 2.16111
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 2.62881 0.21133 12.439 < 2e-16 ***
alcohol 0.31134 0.01707 18.241 < 2e-16 ***
volatile_acidity -1.19442 0.10476 -11.401 < 2e-16 ***
sulphates 0.63716 0.10886 5.853 5.84e-09 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.7114 on 1595 degrees of freedom
Multiple R-squared: 0.2985, Adjusted R-squared: 0.2972
F-statistic: 226.2 on 3 and 1595 DF, p-value: < 2.2e-16
Adjusted R-squared rises from 0.2825 to 0.2972, so sulphates is added to the red wine model.
# Residuals of model_r3 (stored in column "red_resid"), with the identifier,
# the response and the three predictors already in the model removed.
red_resid <- wine_quality_red %>%
  add_residuals(model = model_r3, var = "red_resid") %>%
  select(-c(wine_id, quality, alcohol, volatile_acidity, sulphates))
Next, try adding residual_sugar to the white wine model.
# Three-predictor white wine model: alcohol + volatile_acidity +
# residual_sugar.
# The fit is defined here because the script otherwise uses model_w3 without
# ever creating it; the formula and data match the Call printed by summary().
model_w3 <- lm(formula = quality ~ alcohol + volatile_acidity + residual_sugar,
               data = wine_quality_white)
summary(model_w3)
Call:
lm(formula = quality ~ alcohol + volatile_acidity + residual_sugar,
data = wine_quality_white)
Residuals:
Min 1Q Median 3Q Max
-3.6037 -0.5389 -0.0167 0.5150 3.1930
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 2.304082 0.122414 18.82 <2e-16 ***
alcohol 0.383596 0.010722 35.77 <2e-16 ***
volatile_acidity -2.178114 0.117109 -18.60 <2e-16 ***
residual_sugar 0.026953 0.002601 10.36 <2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.8194 on 4894 degrees of freedom
Multiple R-squared: 0.2415, Adjusted R-squared: 0.241
F-statistic: 519.4 on 3 and 4894 DF, p-value: < 2.2e-16
# Print the two-predictor white wine model again.
# NOTE(review): presumably shown here for comparison with model_w3; confirm.
model_w2a %>%
  summary()
Call:
lm(formula = quality ~ alcohol + volatile_acidity, data = wine_quality_white)
Residuals:
Min 1Q Median 3Q Max
-3.6165 -0.5479 -0.0125 0.5235 3.2676
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 2.97427 0.10506 28.31 <2e-16 ***
alcohol 0.33282 0.00964 34.52 <2e-16 ***
volatile_acidity -2.04894 0.11770 -17.41 <2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.8283 on 4895 degrees of freedom
Multiple R-squared: 0.2249, Adjusted R-squared: 0.2245
F-statistic: 710 on 2 and 4895 DF, p-value: < 2.2e-16
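residual_sugar is statistically significant and adjusted R-squared rises from 0.2245 to 0.241, so it is kept in the white wine model.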
# Residuals of model_w3 (stored in column "white_resid"), with the
# identifier, the response and the three predictors already in the model
# removed.
# The last column name is spelled residual_sugar, as in the model formula;
# the previous misspelling made select() fail with "Column doesn't exist".
white_resid <- wine_quality_white %>%
  add_residuals(model = model_w3, var = "white_resid") %>%
  select(-wine_id, -quality, -alcohol, -volatile_acidity, -residual_sugar)
Error in `select()`:
! Can't subset columns that don't exist.
✖ Column `resiudal_sugar` doesn't exist.
Backtrace:
1. ... %>% ...
3. dplyr:::select.data.frame(., -wine_id, -quality, -alcohol, -volatile_acidity, -resiudal_sugar)